{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fundamental-basketball",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "copyrighted-enhancement",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "golden-planning",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "\n",
    "import path_utils, sas_blob_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "quiet-investigation",
   "metadata": {},
   "source": [
    "## BNF round 2\n",
    "\n",
    "Last time we only imported the 25k entries that were bbox labeled. This notebook imports the rest of them.\n",
    "\n",
    "The entires imported last time had no location - we shoud just delete these entries and ingest these instead, since we include all the existing bboxes anyways\n",
    "\n",
    "There is no sequence info. The location is best-effort: a few of the ones sharing the same prefix are likely the same location..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "id": "noble-origin",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'bnf_20190624and0815'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/bnf/bnf/'  \n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_new.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_new_temp.json' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "blocked-albania",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# entries exported last time. No megadb \"id\" yet - need to query again to upsert to update fields\n",
    "with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/bnf_20190624and0815_w_batch_10_boxes.json') as f:\n",
    "    bnf_db = json.load(f)\n",
    "len(bnf_db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "desirable-promise",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entries_already_in_db = {}\n",
    "for seq in bnf_db:\n",
    "    for im in seq['images']:\n",
    "        entries_already_in_db[im['file']] = seq\n",
    "\n",
    "len(entries_already_in_db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "touched-wallace",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_9df472f5d93e40a5ba30ac49f877636d',\n",
       "  'images': [{'image_id': 14313,\n",
       "    'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2010/Camera Trap Locations 2010/Jelutong Edge 1/JE1 12-06-10/Cam N 11/CDY_0001.JPG',\n",
       "    'bbox': [{'category': 'person', 'bbox': [0.2733, 0.143, 0.2081, 0.68]}]}],\n",
       "  'class': ['__label_unavailable']}]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(bnf_db, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "corrected-athletics",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "36"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "101"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# from CameraTraps/data_management/importers/bnf_to_json.ipynb\n",
    "bnf_valid_species = {\n",
    "    'argus',\n",
    "    'babbler brown bird',\n",
    "    'banded linsang',\n",
    "    'banded palm civet',\n",
    "    'banteng',\n",
    "    'bats',\n",
    "    'bay cat',\n",
    "    'bearded pig',\n",
    "    'binturong',\n",
    "    'bird',\n",
    "    'bornean ground cuckoo',\n",
    "    'bornean red muntja',\n",
    "    'brown wood owl',\n",
    "    'butterfly',\n",
    "    'camera placing shots',\n",
    "    'civet',\n",
    "    'clouded leopard',\n",
    "    'collared mongoose',\n",
    "    'common palm civet',\n",
    "    'crested fireback',\n",
    "    'deer',\n",
    "    'eagle',\n",
    "    'empty',\n",
    "    'fairy pitta',\n",
    "    'flat headed cat',\n",
    "    'gibbon',\n",
    "    'great argus',\n",
    "    'grey headed fish eagle',\n",
    "    'human',\n",
    "    'hunter dog',\n",
    "    'leopard cat',\n",
    "    'long-tailed macaque',\n",
    "    'macaque',\n",
    "    'malay civet',\n",
    "    'marbled cat',\n",
    "    'mongoose',\n",
    "    'monitor lizard',\n",
    "    'monkey',\n",
    "    'moon rat',\n",
    "    'mouse deer',\n",
    "    'muntjac',\n",
    "    'orangutan',\n",
    "    'otter civet',\n",
    "    'pangolin',\n",
    "    'pig',\n",
    "    'pig-tailed macaque',\n",
    "    'porcupine',\n",
    "    'primate',\n",
    "    'raptor',\n",
    "    'red langur',\n",
    "    'red leaf monkey',\n",
    "    'reptile',\n",
    "    'rodent',\n",
    "    'sambar deer',\n",
    "    'short-tailed mongoose',\n",
    "    'small-toothed palm civet',\n",
    "    'squirrel',\n",
    "    \"storm's stork\",\n",
    "    'sun bear',\n",
    "    'treeshrew',\n",
    "    'turtle',\n",
    "    'unknown',\n",
    "    'white headed fish eagle',\n",
    "    'yellow muntjac',\n",
    "    'yellow throated marten'\n",
    "}\n",
    "\n",
    "cat_map = {\n",
    "    'nothing': 'empty',\n",
    "    \"strom's strok\": \"storm's stork\",\n",
    "    \"storms stork\": \"storm's stork\",\n",
    "    \"storms st\": \"storm's stork\",\n",
    "    \"storm stork\": \"storm's stork\", \n",
    "    'pig-t-macaque': 'pig-tailed macaque',\n",
    "    \"unclear\": \"unknown\",\n",
    "    'camera set-up': 'human',\n",
    "    'humans': 'human',\n",
    "    'orang-utans': 'orangutan',\n",
    "    'orang utan': 'orangutan',\n",
    "    'sunbear': 'sun bear',\n",
    "    'marble cat': 'marbled cat',\n",
    "    'flat-headed-cat': 'flat headed cat',\n",
    "    'dog hunter': 'hunter dog',\n",
    "    'small thoodhed palm civet': 'small toothed palm civet',\n",
    "    'short tailed-mongoose': 'short-tailed mongoose',\n",
    "    'short taled-mongoose': 'short-tailed mongoose',\n",
    "    'short-t mongoose': 'short-tailed mongoose',\n",
    "    'short-t-monggoose': 'short-tailed mongoose',\n",
    "    'short tailed mongoose': 'short-tailed mongoose',\n",
    "    'short-t-mongoose': 'short-tailed mongoose',\n",
    "    'short-tailed-mongoose': 'short-tailed mongoose',\n",
    "    's-t mongoose': 'short-tailed mongoose',\n",
    "    'short - tailed mongoose': 'short-tailed mongoose',\n",
    "    'short tailet mongoose': 'short-tailed mongoose',\n",
    "    'mongooe': 'mongoose',\n",
    "    'reed-leaf monkey': 'red leaf monkey',\n",
    "    'otters civet': 'otter civet',\n",
    "    'small toothed palm civet': 'small-toothed palm civet', \n",
    "    'st palm civet': 'small-toothed palm civet', \n",
    "    'small thoodhed palm civet': 'small-toothed palm civet', \n",
    "    'small thoothed palm civet': 'small-toothed palm civet', \n",
    "    'small thoodheed palm civet': 'small-toothed palm civet',\n",
    "    'small-thoodhed palm civet': 'small-toothed palm civet',\n",
    "    'small-toodhed palm civet': 'small-toothed palm civet',\n",
    "    'collard mongoose': 'collared mongoose'\n",
    "}\n",
    "\n",
    "len(bnf_valid_species)\n",
    "len(cat_map)\n",
    "combined_species_names = bnf_valid_species | set(cat_map.keys())\n",
    "len(combined_species_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "chubby-nurse",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_container_sas = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "satisfactory-portable",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "0it [00:00, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "listing blobs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "326797it [02:53, 1887.80it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Enumerated 279638 matching blobs out of 326797 total\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "image_paths = sas_blob_utils.list_blobs_in_container(\n",
    "    container_uri=input_container_sas,\n",
    "    blob_suffix=('.jpg', '.jpeg', '.png'),  # check will be case-insensitive\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "french-blackberry",
   "metadata": {},
   "source": [
    "### 20190815cameratraps folder\n",
    "\n",
    "No species label but somewhat neat location... Location still has issues for paths e.g.\n",
    "\n",
    "Sometimes the date is not correct - in 2015 folder but the date on other folder indicates another year."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "id": "hydraulic-efficiency",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'02-05-13'"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'Km2 x Railway'"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "location_date = 'Km2 x Railway 02-05-13'\n",
    "date = location_date.split(' ')[-1]\n",
    "location = location_date.split(date)[0].strip()\n",
    "date\n",
    "location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "id": "rapid-concrete",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 279638/279638 [00:00<00:00, 886586.67it/s] \n"
     ]
    }
   ],
   "source": [
    "folder = '201908'\n",
    "\n",
    "seq_08 = []\n",
    "locations_08 = set()\n",
    "\n",
    "years_w_loc_date = ['2013', '2014', '2015', '2016', '2017']\n",
    "\n",
    "for image_path in tqdm(image_paths):\n",
    "    if not image_path.startswith(folder):\n",
    "        continue\n",
    "        \n",
    "    bbox = None\n",
    "    clss = None\n",
    "    if image_path in entries_already_in_db:\n",
    "        bbox = entries_already_in_db[image_path]['images'][0]['bbox']\n",
    "        clss = entries_already_in_db[image_path]['class']\n",
    "        \n",
    "    p_parts = image_path.split('/')\n",
    "    year = p_parts[2]\n",
    "#     if year in years_w_loc_date:\n",
    "#         location_date = p_parts[-3]\n",
    "#         if location_date.startswith('Cam M'):\n",
    "#             location_date = p_parts[-4] \n",
    "        \n",
    "#         date = location_date.split(' ')[-1]\n",
    "#         location = location_date.split(date)[0].strip()\n",
    "#     else:\n",
    "#         location = p_parts[-3]\n",
    "#         date = year\n",
    "        \n",
    "#     if location in ['', '2018', 'Cam']:\n",
    "#         location = 'unknown'\n",
    "#     locations_08.add(location)\n",
    "    \n",
    "    im = {\n",
    "        'file': image_path,\n",
    "        'frame_num': 1,\n",
    "    }\n",
    "    if bbox is not None:\n",
    "        im['bbox'] = bbox\n",
    "    \n",
    "    seq = {\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'dummy_{folder}_{len(seq_08)}',\n",
    "#         'location': 'unknown',\n",
    "        'datetime': year,\n",
    "        'images': [im]\n",
    "    }\n",
    "    if clss is not None:\n",
    "        seq['class'] = clss\n",
    "    else:\n",
    "        seq['class'] = ['__label_unavailable']\n",
    "        \n",
    "    seq_08.append(seq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "id": "defined-childhood",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "40592"
      ]
     },
     "execution_count": 210,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(seq_08)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "id": "liquid-closure",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_25982',\n",
       " 'datetime': '2016',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2016/C.Maret/T SC 610m 08-03-2016/Cam 2015-20/IMAG0059.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_28334',\n",
       " 'datetime': '2016',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2016/H.Agustus/T2E x ORW 25-08-2016/Cam C27/Cdy00030.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_7029',\n",
       " 'datetime': '2013',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2013/L.Cam Traps Desember 2013/T1A x Railway 13-12-13/Cam S9/Cdy00493.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_9760',\n",
       " 'datetime': '2014',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2014/C.Cam Traps Maret 2014/Km3 x Railway 01-03-2014/Cam S5/Cdy00401.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_32795',\n",
       " 'datetime': '2018',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/C.Maret/T SC x Canal D/Cam 2017-1/IMAG0102.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_35095',\n",
       " 'datetime': '2018',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/G.Juli/T SC 1412m/Cam 2018-5/IMAG0059.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_33861',\n",
       " 'datetime': '2018',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/E.Mei/T SC x Canal D/Cam 2017-3/T SC x Canal D Malay Civet.JPG',\n",
       "   'frame_num': 1,\n",
       "   'bbox': [{'category': 'animal',\n",
       "     'bbox': [0.1277, 0.7367, 0.5104, 0.1969]}]}],\n",
       " 'class': ['malay civet']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_23350',\n",
       " 'datetime': '2015',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2015/J. Oktober 2015/T 2E x ORW/Cam C27/Cdy00008.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_36475',\n",
       " 'datetime': '2018',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/J.Oktober/Km4 x Railway/Cam 2018-1/IMAG0015.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201908_22885',\n",
       " 'datetime': '2015',\n",
       " 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2015/H. Agustus 2015/Tower 15-08-2015/CamC30/Cdy00030.JPG',\n",
       "   'frame_num': 1}],\n",
       " 'class': ['__label_unavailable']}"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# seq_08[21785]\n",
    "for s in sample(seq_08, 10):\n",
    "    s"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "smaller-interest",
   "metadata": {},
   "source": [
    "### 20190624cameratraps folder\n",
    "Only some folders have location / species, but the location is hard to parse consistently with the 201908 folder, so we're matching locations from the 201908 folder and use that if available."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "id": "still-secret",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  3%|▎         | 7263/279638 [00:00<00:07, 36301.36it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['white headed fish eagle']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 279638/279638 [00:09<00:00, 29659.14it/s]\n"
     ]
    }
   ],
   "source": [
    "folder = '201906'\n",
    "\n",
    "seq_06 = []\n",
    "locations_06 = set()\n",
    "num_species_found = 0\n",
    "\n",
    "folders_w_loc = ['Belantikan 1 RAW', 'Belantikan 2 RAW', 'Sungai Wain']\n",
    "\n",
    "for image_path in tqdm(image_paths):\n",
    "    if not image_path.startswith(folder):\n",
    "        continue\n",
    "        \n",
    "    bbox = None\n",
    "    clss = None\n",
    "    if image_path in entries_already_in_db:\n",
    "        bbox = entries_already_in_db[image_path]['images'][0]['bbox']\n",
    "        clss = entries_already_in_db[image_path]['class']\n",
    "    \n",
    "    if image_path == '20190624cameratraps/images/Bawan/BEST ANIMALS ALL/Birds/white headed fish eagle.JPG':\n",
    "        print(clss)\n",
    "    \n",
    "    if clss is None:\n",
    "        candidate_species = set()\n",
    "        for s in combined_species_names:\n",
    "            if s in image_path.lower():\n",
    "                if s in cat_map:\n",
    "                    s = cat_map[s]\n",
    "                candidate_species.add(s)\n",
    "                \n",
    "        if len(candidate_species) > 0:\n",
    "            num_species_found += 1\n",
    "            clss = [s for s in candidate_species]\n",
    "    if clss is None:  # actually all species labels were found in the last round... this will be None\n",
    "        clss = ['__label_unavailable']    # actually all species labels were found in the last round... this will be None\n",
    "    \n",
    "    p_parts = image_path.split('/')\n",
    "\n",
    "    location = 'unknown'\n",
    "#     for loc08 in locations_08:\n",
    "#         if loc08 in image_path[51:]:\n",
    "#             location = loc08\n",
    "    \n",
    "#     locations_06.add(location)\n",
    "    \n",
    "    im = {\n",
    "        'file': image_path,\n",
    "        'frame_num': 1,\n",
    "    }\n",
    "    if bbox is not None:\n",
    "        im['bbox'] = bbox\n",
    "    \n",
    "    seq_06.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'dummy_{folder}_{len(seq_06)}',\n",
    "#         'location': location,\n",
    "        'class': clss,\n",
    "        'images': [im]\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "id": "vocational-israeli",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 213,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_species_found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "id": "scenic-designer",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "239046"
      ]
     },
     "execution_count": 214,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(seq_06)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "id": "stupid-swing",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_77201',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2011/Camera trap locations 2011/T.1A x Railway/T1A x Railway 13-04-11/CAM N21/Dog 1 8-04-11 T1A+Rlwy Cam N21.JPG',\n",
       "    'frame_num': 1}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_173811',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2011/Camera trap locations 2011/KM 2 x Railway/Km2 x Railway 05-12-11/Cam N9/CDY_0009.JPG',\n",
       "    'frame_num': 1}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_64272',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2010/Camera Trap Locations 2010/TO x TC/T.0 x T.C 4-05-10/Cam N 2/CDY_0027.JPG',\n",
       "    'frame_num': 1}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_37650',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2009/Camera Trap Locations/T1AxRailway/T1A x Railway 4-12-09/Cam 21/CDY_0012.JPG',\n",
       "    'frame_num': 1}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_190319',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2011/Camera trap locations 2011/T0 x TC/T0 xTC 06-10-11/Cam N1/CDY_0009.JPG',\n",
       "    'frame_num': 1}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_154159',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2009/other stuff/CAMERA TRAPS 2009 up to date/T1AxRailway/T1a x Railway 08-06-09/Cam 15/CDY_0005.JPG',\n",
       "    'frame_num': 1}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_200603',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2012/Camera Trap Locations 2012/T0 x TF/T0 x TF 23-03-12/Cam 2011-12/CDY_0080.JPG',\n",
       "    'frame_num': 1}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_152535',\n",
       "  'class': ['empty'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2009/other stuff/CAMERA TRAPS 2009 up to date/KM 2 x Railway/Km2 x Railway Download 23-05-09 NOTHING macaque only/CDY_0303.JPG',\n",
       "    'frame_num': 1,\n",
       "    'bbox': []}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_46748',\n",
       "  'class': ['sun bear'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2010/Best of 2010/Best of October/T1AxRailway Sunbear 30-10-2010.JPG',\n",
       "    'frame_num': 1,\n",
       "    'bbox': [{'category': 'animal', 'bbox': [0, 0.6494, 0.4258, 0.289]},\n",
       "     {'category': 'animal', 'bbox': [0.5892, 0.616, 0.2506, 0.2624]}]}]},\n",
       " {'dataset': 'bnf_20190624and0815',\n",
       "  'seq_id': 'dummy_201906_145399',\n",
       "  'class': ['__label_unavailable'],\n",
       "  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMTRAPS Temporary/Photos download/2014/D.Cam Traps April 2014/Km3 x Railway 11-04-2014/Cam S6/Cdy00004.JPG',\n",
       "    'frame_num': 1}]}]"
      ]
     },
     "execution_count": 219,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'dataset': 'bnf_20190624and0815',\n",
       " 'seq_id': 'dummy_201906_53718',\n",
       " 'class': ['__label_unavailable'],\n",
       " 'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2010/Camera Trap Locations 2010/Km5 x Railway/KM 5 14-05-10/Cam N 10/CDY_0014.JPG',\n",
       "   'frame_num': 1}]}"
      ]
     },
     "execution_count": 219,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(seq_06, 10)\n",
    "seq_06[53718]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "phantom-static",
   "metadata": {},
   "source": [
    "### Both folders combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "id": "canadian-movie",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "279638"
      ]
     },
     "execution_count": 220,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = seq_06 + seq_08\n",
    "len(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "id": "present-programmer",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "logical-medicine",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "verbal-territory",
   "metadata": {},
   "source": [
    "### Sample\n",
    "Send all 40,592 images from the 08 folder (which is neater), and then sample 110,000 from the 06 folder. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "id": "bearing-importance",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "40592"
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "149853"
      ]
     },
     "execution_count": 228,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "im_to_send = [seq['images'][0] for seq in seq_08 if 'empty' not in seq['class']]  # actually we don't have labels \n",
    "len(im_to_send)\n",
    "\n",
    "\n",
    "sample_06 = sample(seq_06, 110000)\n",
    "\n",
    "im_06 = [seq['images'][0] for seq in sample_06 if 'empty' not in seq['class']]\n",
    "\n",
    "im_to_send = im_to_send + im_06\n",
    "len(im_to_send)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "id": "contrary-jacob",
   "metadata": {},
   "outputs": [],
   "source": [
    "bnf_list_to_download = [im['file'] + '\\n' for im in im_to_send]\n",
    "\n",
    "with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/bnf_files.txt', 'w') as f:\n",
    "    f.writelines(bnf_list_to_download)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "constant-assist",
   "metadata": {},
   "source": [
    "## Rename and copy to imerit12f folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "id": "municipal-anaheim",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "149853"
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "279638"
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'bnf_20190624and0815'"
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'/mink_disk_0/camtraps/bnf/'"
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(im_to_send)\n",
    "len(sequences)\n",
    "dataset_name\n",
    "container_root"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "id": "uniform-simulation",
   "metadata": {},
   "outputs": [],
   "source": [
    "im_to_seq_id = {}\n",
    "for seq in sequences:\n",
    "    for im in seq['images']:\n",
    "        im_to_seq_id[im['file']] = seq['seq_id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "id": "brave-blake",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 619 ms, sys: 8 ms, total: 627 ms\n",
      "Wall time: 626 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/bnf/bnf/20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMTRAPS Temporary/Photos download/2012/Cam Traps Oct 2012/T0 x TC 19-10-12/Cam N1/CDY_0037.JPG',\n",
       " '/mink_disk_0/camtraps/imerit12f/bnf_20190624and0815.seqdummy_201906_132338.frame1.jpg')"
      ]
     },
     "execution_count": 242,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "path_pairs = []\n",
    "\n",
    "for im in im_to_send:\n",
    "    src_path = os.path.join(container_root, im['file'])\n",
    "    seq_id = im_to_seq_id[im['file']]\n",
    "    frame = im['frame_num']\n",
    "    dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f', \n",
    "                            f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "    path_pairs.append((src_path, dst_path))\n",
    "\n",
    "path_pairs[-50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "id": "alike-crash",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "149853"
      ]
     },
     "execution_count": 243,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "id": "lightweight-penguin",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4min 51s, sys: 9min 21s, total: 14min 12s\n",
      "Wall time: 15min 10s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "def copy_file(src_path, dst_path):\n",
    "    return copyfile(src_path, dst_path)\n",
    "\n",
    "with ThreadPool(16) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "id": "strategic-israeli",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "149853"
      ]
     },
     "execution_count": 245,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dst_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "looking-hollow",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
